spacy-streamlit 是一個讓 streamlit 可以顯示 displaCy 效果的開源專案。
首先用一張圖秀一下今天要介紹的內容有什麼效果~
使用 streamlit 對於現在的使用場景具備以下優點:
pip install spacy-streamlit
import stanza
import spacy_stanza
from ckip_transformers.nlp import CkipPosTagger, CkipNerChunker
import spacy
from spacy.matcher import DependencyMatcher
from spacy.tokens import Span
# Stanza language code shared by the model download and the pipeline loader.
STANZA_LANG = "zh-hant"

# Fetch the Stanza models for STANZA_LANG.
stanza.download(STANZA_LANG)

# Wrap the Stanza pipeline as a spaCy pipeline; "xx" is passed as the spaCy
# language name while Stanza itself is told to use STANZA_LANG.
nlp = spacy_stanza.load_pipeline("xx", lang=STANZA_LANG)
這段程式碼的重點是 `generate_doc(text: str)`:
輸入一個新聞段落(text),回傳一個處理好的 spaCy doc。
def add_ner(doc):
    """Add CKIP named entities to ``doc.ents`` in place.

    The text of ``doc`` is run through a ``CkipNerChunker`` and every entity
    it returns is converted to a spaCy span using the entity's character
    offsets and NER label. The new spans are merged with the entities already
    on ``doc``.

    Args:
        doc: The spaCy ``Doc`` to annotate; it is modified in place.

    Returns:
        None.
    """
    # NOTE: the CKIP driver is constructed on every call.
    ner_driver = CkipNerChunker(model="bert-base")
    ner = ner_driver([str(doc)], show_progress=False)

    ner_spans = []
    for entity in ner[0]:
        start_char, end_char = entity.idx[0], entity.idx[1]
        span = doc.char_span(start_char, end_char, label=entity.ner)
        if span is None:
            # The offsets did not line up with token boundaries; retry with
            # the end offset widened by one character.
            span = doc.char_span(start_char, end_char + 1, label=entity.ner)
        if span is None:
            # Still not alignable: skip it, because a None entry would make
            # the ``doc.ents`` assignment below fail.
            continue
        ner_spans.append(span)

    # ``doc.ents`` rejects overlapping spans, so resolve overlaps (and exact
    # duplicates) first; filter_spans prefers the longest span.
    doc.ents = spacy.util.filter_spans(list(doc.ents) + ner_spans)
def add_ckip_tag(doc):
    """Overwrite every token's ``tag_`` with the tag from ``CkipPosTagger``.

    The existing tokenization of ``doc`` is reused: the token texts are passed
    to the tagger as one pre-split sentence, and the returned tags are written
    back token by token.

    Args:
        doc: The spaCy ``Doc`` to annotate; it is modified in place.

    Returns:
        None.
    """
    tagger = CkipPosTagger(model="bert-base")

    # The tagger takes a batch of sentences; send the whole doc as one.
    sentence = list(map(str, doc))
    tagged_batch = tagger([sentence], show_progress=False)

    for tok, ckip_tag in zip(doc, tagged_batch[0]):
        tok.tag_ = ckip_tag
# Dependency pattern with three nodes, all hanging off one anchor token:
#   VE        - a token whose fine-grained tag is "VE" (the tag written by
#               add_ckip_tag; presumably CKIP's tag for verbs that take a
#               clausal object - confirm against the CKIP tag set)
#   who_root  - a direct dependent of VE with dependency label "nsubj"
#   idea_root - a direct dependent of VE labelled "ccomp" or "parataxis"
pattern = [
    {"RIGHT_ID": "VE", "RIGHT_ATTRS": {"TAG": "VE"}},
    {
        "LEFT_ID": "VE",
        "REL_OP": ">",
        "RIGHT_ID": "who_root",
        "RIGHT_ATTRS": {"DEP": "nsubj"},
    },
    {
        "LEFT_ID": "VE",
        "REL_OP": ">",
        "RIGHT_ID": "idea_root",
        "RIGHT_ATTRS": {"DEP": {"IN": ["ccomp", "parataxis"]}},
    },
]

# Name under which the pattern is registered in the matcher.
version = "v0"

matcher = DependencyMatcher(nlp.vocab, validate=True)
matcher.add(version, [pattern])
def generate_doc(text: str):
    """Parse ``text`` and attach opinion-extraction spans to the result.

    The text is run through ``nlp``, enriched by ``add_ner`` and
    ``add_ckip_tag``, and then searched with the module-level dependency
    ``matcher``. Among all matches, only those sharing the verb/subject pair
    with the smallest token distance are kept. They yield one "VERB" span,
    one "WHO" span (the subject's subtree) and one "OPINION" span per match
    (the subtree of the third matched token).

    Args:
        text: The paragraph to analyse.

    Returns:
        The processed ``Doc``. ``doc.spans["sc"]`` holds the overlap-filtered
        spans, or is empty when the matcher found nothing.
    """
    doc = nlp(text)
    add_ner(doc)
    add_ckip_tag(doc)

    # Each match is (match_id, token_ids); rank by how far apart the first
    # two matched tokens (verb, subject) are.
    ranked = sorted(matcher(doc), key=lambda m: abs(m[1][0] - m[1][1]))
    if not ranked:
        doc.spans["sc"] = []
        return doc

    # Keep only the matches that share the closest verb/subject pair.
    verb_i, who_i = ranked[0][1][0], ranked[0][1][1]
    best = [m for m in ranked if m[1][0] == verb_i and m[1][1] == who_i]

    verb_span = Span(doc, verb_i, verb_i + 1, label="VERB")
    who_token = doc[who_i]
    who_span = Span(
        doc, who_token.left_edge.i, who_token.right_edge.i + 1, label="WHO"
    )
    opinion_spans = [
        Span(
            doc,
            doc[token_ids[2]].left_edge.i,
            doc[token_ids[2]].right_edge.i + 1,
            label="OPINION",
        )
        for _, token_ids in best
    ]

    doc.spans["sc"] = spacy.util.filter_spans(
        [verb_span, who_span, *opinion_spans]
    )
    return doc
import spacy_streamlit
import streamlit as st
# Sample news paragraph pre-filled in the text area.
DEFAULT_TEXT = """媒體關注數位部部長唐鳳對數位中介服務法看法,唐鳳表示,監理業務不屬於數位部範圍,面對大型跨境數位平台,最重要的是要確保現實世界中覺得合理的價值,平台上也應該符合相關的社會價值,遵守常規。"""

st.title("My cool app")

# Read the text to analyse from the user, defaulting to DEFAULT_TEXT.
text = st.text_area("Text to analyze", DEFAULT_TEXT, height=200)
doc = generate_doc(text)

# Render the "sc" span group once; the original called this twice in a row.
spacy_streamlit.visualize_spans(doc, spans_key="sc")

# Render the dependency parse of the same doc.
spacy_streamlit.visualize_parser(doc)
- `st.title("My cool app")`:設定頁面標題。
- `text = st.text_area("Text to analyze", DEFAULT_TEXT, height=200)`:取得使用者的文字輸入,並存入 text,預設值為 DEFAULT_TEXT。
- `spacy_streamlit.visualize_spans(doc, spans_key="sc")`:視覺化顯示 doc 中 key 為 "sc" 的 span;之後可以改顯示不同意見提取方法所產生的 span,例如 key 為 "rule_0"、"rule_1" 的 span,用於比較不同方法所產生的結果。
- `spacy_streamlit.visualize_parser(doc)`:視覺化顯示 doc 的 dependency parsing 結果。

在終端機中執行下方指令:
streamlit run 上方程式碼的檔名.py
接著會自動開啟 http://localhost:8501/,就可以看到結果了!